options(java.parameters = "-Xmx2048m",
stringsAsFactors = FALSE,
encoding = 'UTF-8')
suppressPackageStartupMessages({
# ISLR2
library(ISLR2)
# Splines
library(splines)
# DM
library(zip)
library(openxlsx)
library(readxl)
library(writexl)
library(RcppRoll)
library(plyr)
library(stringi)
library(feather)
library(RODBC)
library(MASS)
library(car)
library(caret)
library(data.table)
library(lubridate)
library(plotly)
library(pROC)
library(tidymodels)
library(tidyverse)
})
Select particular variables of Boston data.
boston.subset <- Boston %>%
select(dis, nox) %>%
arrange(dis)
head(boston.subset)
## dis nox
## 1 1.1296 0.668
## 2 1.1370 0.668
## 3 1.1691 0.631
## 4 1.1742 0.668
## 5 1.1781 0.659
## 6 1.2024 0.631
Fit.
boston.poly <- lm(nox ~ poly(dis, degree = 3), data = boston.subset)
summary(boston.poly)
##
## Call:
## lm(formula = nox ~ poly(dis, degree = 3), data = boston.subset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.121130 -0.040619 -0.009738 0.023385 0.194904
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.554695 0.002759 201.021 < 2e-16 ***
## poly(dis, degree = 3)1 -2.003096 0.062071 -32.271 < 2e-16 ***
## poly(dis, degree = 3)2 0.856330 0.062071 13.796 < 2e-16 ***
## poly(dis, degree = 3)3 -0.318049 0.062071 -5.124 4.27e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06207 on 502 degrees of freedom
## Multiple R-squared: 0.7148, Adjusted R-squared: 0.7131
## F-statistic: 419.3 on 3 and 502 DF, p-value: < 2.2e-16
Plot.
boston.poly.pred <- boston.subset %>%
mutate(nox_pred = boston.poly$fitted.values)
boston.poly.plot <- plot_ly(data = boston.poly.pred, x = ~dis) %>%
add_trace(y = ~nox, name = 'True', type = 'scatter', mode = 'markers') %>%
add_trace(y = ~nox_pred, name = 'Fit', type = 'scatter', mode = 'lines') %>%
layout(
title = 'Polynomial Fits'
)
boston.poly.plot
Fit.
boston.poly.list <- list()
for (i in 1:10) {
boston.poly.list[[i]] <- lm(nox ~ poly(dis, degree = i), data = boston.subset)
}
Plot.
pred.name <- paste0('nox_pred', str_pad(1:10, 2, side = 'left', pad = 0))
boston.poly.list.pred <- boston.subset
for (i in 1:10) {
boston.poly.list.pred[pred.name[i]] <- boston.poly.list[[i]]$fitted.values
}
boston.poly.list.plot <- plot_ly(data = boston.poly.list.pred, x = ~dis) %>%
add_trace(y = ~nox, name = 'nox', type = 'scatter', mode = 'markers') %>%
add_trace(y = ~get(pred.name[1]), name = pred.name[1],
type = 'scatter', mode = 'lines') %>%
add_trace(y = ~get(pred.name[2]), name = pred.name[2],
type = 'scatter', mode = 'lines') %>%
add_trace(y = ~get(pred.name[3]), name = pred.name[3],
type = 'scatter', mode = 'lines') %>%
add_trace(y = ~get(pred.name[4]), name = pred.name[4],
type = 'scatter', mode = 'lines') %>%
add_trace(y = ~get(pred.name[5]), name = pred.name[5],
type = 'scatter', mode = 'lines') %>%
add_trace(y = ~get(pred.name[6]), name = pred.name[6],
type = 'scatter', mode = 'lines') %>%
add_trace(y = ~get(pred.name[7]), name = pred.name[7],
type = 'scatter', mode = 'lines') %>%
add_trace(y = ~get(pred.name[8]), name = pred.name[8],
type = 'scatter', mode = 'lines') %>%
add_trace(y = ~get(pred.name[9]), name = pred.name[9],
type = 'scatter', mode = 'lines') %>%
add_trace(y = ~get(pred.name[10]), name = pred.name[10],
type = 'scatter', mode = 'lines') %>%
layout(
title = 'Polynomial Fits of Different Degrees'
)
boston.poly.list.plot
Residual sum of squares.
boston.poly.list.res <- boston.subset
for (i in 1:10) {
boston.poly.list.res[paste0('nox_res', str_pad(i, 2, side = 'left', pad = 0))] <- boston.poly.list[[i]]$residuals
}
boston.poly.list.rss <- boston.poly.list.res %>%
select(starts_with('nox_')) %>%
summarise_all(function(x) {sum(x^2)}) %>%
as.data.table() %>%
melt(variable.name = 'poly', value.name = 'rss')
boston.poly.list.rss
## poly rss
## 1: nox_res01 2.768563
## 2: nox_res02 2.035262
## 3: nox_res03 1.934107
## 4: nox_res04 1.932981
## 5: nox_res05 1.915290
## 6: nox_res06 1.878257
## 7: nox_res07 1.849484
## 8: nox_res08 1.835630
## 9: nox_res09 1.833331
## 10: nox_res10 1.832171
Split data set to 10-fold.
set.seed(0803)
folds.index <- createFolds(1:nrow(boston.subset), k = 10)
boston.poly.rss.degrees <- c()
for (i in 1:10) {
rss.folds <- c()
for (j in 1:10) {
cv.train <- boston.subset %>%
filter(!(row_number() %in% folds.index[[j]]))
cv.valid <- boston.subset %>%
filter(row_number() %in% folds.index[[j]])
poly.cv <- lm(nox ~ poly(dis, degree = i), data = cv.train)
rss <- sum((cv.valid$nox - predict(poly.cv, newdata = cv.valid['dis'])) ^ 2)
rss.folds <- append(rss.folds, rss)
}
boston.poly.rss.degrees <- append(boston.poly.rss.degrees, sum(rss.folds))
}
boston.poly.cv.rss <- data.frame(poly = pred.name,
rss = boston.poly.rss.degrees)
boston.poly.cv.rss
## poly rss
## 1 nox_pred01 2.788914
## 2 nox_pred02 2.060030
## 3 nox_pred03 1.959390
## 4 nox_pred04 1.972514
## 5 nox_pred05 2.131732
## 6 nox_pred06 2.621185
## 7 nox_pred07 5.269533
## 8 nox_pred08 2.893226
## 9 nox_pred09 3.863362
## 10 nox_pred10 2.733447
When degree = 3, the polynomial regression has the minimal CV RSS, so the optimal degree is 3.
Regression spline with 4 degrees of freedom.
(boston.sp.knots <- attr(ns(boston.subset$dis, df = 4), 'knots'))
## 25% 50% 75%
## 2.100175 3.207450 5.188425
boston.sp <- lm(nox ~ bs(dis, df = 4, knots = boston.sp.knots, degree = 3), data = boston.subset)
summary(boston.sp)
##
## Call:
## lm(formula = nox ~ bs(dis, df = 4, knots = boston.sp.knots, degree = 3),
## data = boston.subset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.128538 -0.037813 -0.009987 0.022644 0.195494
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 0.65622 0.02370
## bs(dis, df = 4, knots = boston.sp.knots, degree = 3)1 0.10222 0.03516
## bs(dis, df = 4, knots = boston.sp.knots, degree = 3)2 -0.02963 0.02338
## bs(dis, df = 4, knots = boston.sp.knots, degree = 3)3 -0.15959 0.02791
## bs(dis, df = 4, knots = boston.sp.knots, degree = 3)4 -0.22815 0.03324
## bs(dis, df = 4, knots = boston.sp.knots, degree = 3)5 -0.26272 0.04930
## bs(dis, df = 4, knots = boston.sp.knots, degree = 3)6 -0.24002 0.05434
## t value Pr(>|t|)
## (Intercept) 27.689 < 2e-16 ***
## bs(dis, df = 4, knots = boston.sp.knots, degree = 3)1 2.907 0.00381 **
## bs(dis, df = 4, knots = boston.sp.knots, degree = 3)2 -1.267 0.20571
## bs(dis, df = 4, knots = boston.sp.knots, degree = 3)3 -5.718 1.86e-08 ***
## bs(dis, df = 4, knots = boston.sp.knots, degree = 3)4 -6.864 1.99e-11 ***
## bs(dis, df = 4, knots = boston.sp.knots, degree = 3)5 -5.329 1.50e-07 ***
## bs(dis, df = 4, knots = boston.sp.knots, degree = 3)6 -4.417 1.23e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06062 on 499 degrees of freedom
## Multiple R-squared: 0.7295, Adjusted R-squared: 0.7263
## F-statistic: 224.3 on 6 and 499 DF, p-value: < 2.2e-16
Plot.
boston.sp.pred <- boston.subset %>%
mutate(nox_pred = boston.sp$fitted.values)
boston.sp.plot <- plot_ly(data = boston.sp.pred, x = ~dis) %>%
add_trace(y = ~nox, name = 'True', type = 'scatter', mode = 'markers') %>%
add_trace(y = ~nox_pred, name = 'Fit', type = 'scatter', mode = 'lines') %>%
layout(
title = 'Spline Fits'
)
boston.sp.plot
Fit.
boston.sp.knots.list <- list()
boston.sp.list <- list()
for (i in 1:10) {
boston.sp.knots.list[[i]] <- attr(ns(boston.subset$dis, df = i), 'knots')
boston.sp.list[[i]] <- lm(nox ~ bs(dis, df = i, knots = boston.sp.knots.list[[i]], degree = 3), data = boston.subset)
}
Plot.
pred.name <- paste0('nox_pred', str_pad(1:10, 2, side = 'left', pad = 0))
boston.sp.list.pred <- boston.subset
for (i in 1:10) {
boston.sp.list.pred[pred.name[i]] <- boston.sp.list[[i]]$fitted.values
}
boston.sp.list.plot <- plot_ly(data = boston.sp.list.pred, x = ~dis) %>%
add_trace(y = ~nox, name = 'nox', type = 'scatter', mode = 'markers') %>%
add_trace(y = ~get(pred.name[1]), name = pred.name[1],
type = 'scatter', mode = 'lines') %>%
add_trace(y = ~get(pred.name[2]), name = pred.name[2],
type = 'scatter', mode = 'lines') %>%
add_trace(y = ~get(pred.name[3]), name = pred.name[3],
type = 'scatter', mode = 'lines') %>%
add_trace(y = ~get(pred.name[4]), name = pred.name[4],
type = 'scatter', mode = 'lines') %>%
add_trace(y = ~get(pred.name[5]), name = pred.name[5],
type = 'scatter', mode = 'lines') %>%
add_trace(y = ~get(pred.name[6]), name = pred.name[6],
type = 'scatter', mode = 'lines') %>%
add_trace(y = ~get(pred.name[7]), name = pred.name[7],
type = 'scatter', mode = 'lines') %>%
add_trace(y = ~get(pred.name[8]), name = pred.name[8],
type = 'scatter', mode = 'lines') %>%
add_trace(y = ~get(pred.name[9]), name = pred.name[9],
type = 'scatter', mode = 'lines') %>%
add_trace(y = ~get(pred.name[10]), name = pred.name[10],
type = 'scatter', mode = 'lines') %>%
layout(
title = 'Spline Fits of Different Degrees of Freedom'
)
boston.sp.list.plot
Residual sum of squares.
boston.sp.list.res <- boston.subset
for (i in 1:10) {
boston.sp.list.res[paste0('nox_res', str_pad(i, 2, side = 'left', pad = 0))] <- boston.sp.list[[i]]$residuals
}
boston.sp.list.rss <- boston.sp.list.res %>%
select(starts_with('nox_')) %>%
summarise_all(function(x) {sum(x^2)}) %>%
as.data.table() %>%
melt(variable.name = 'sp', value.name = 'rss')
boston.sp.list.rss
## sp rss
## 1: nox_res01 1.934107
## 2: nox_res02 1.922775
## 3: nox_res03 1.840173
## 4: nox_res04 1.833966
## 5: nox_res05 1.829884
## 6: nox_res06 1.816995
## 7: nox_res07 1.825653
## 8: nox_res08 1.792535
## 9: nox_res09 1.796992
## 10: nox_res10 1.788999
Split data set to 10-fold.
set.seed(0803)
folds.index <- createFolds(1:nrow(boston.subset), k = 10)
boston.sp.rss.degrees <- c()
for (i in 1:10) {
rss.folds <- c()
for (j in 1:10) {
cv.train <- boston.subset %>%
filter(!(row_number() %in% folds.index[[j]]))
cv.valid <- boston.subset %>%
filter(row_number() %in% folds.index[[j]])
knots <- attr(ns(cv.train$dis, df = i), 'knots')
sp.cv <- lm(nox ~ bs(dis, df = i, knots = knots, degree = 3), data = cv.train)
rss <- sum((cv.valid$nox - predict(sp.cv, newdata = cv.valid['dis'])) ^ 2)
rss.folds <- append(rss.folds, rss)
}
boston.sp.rss.degrees <- append(boston.sp.rss.degrees, sum(rss.folds))
}
boston.sp.cv.rss <- data.frame(spline = pred.name,
rss = boston.sp.rss.degrees)
boston.sp.cv.rss
## spline rss
## 1 nox_pred01 1.959390
## 2 nox_pred02 1.974652
## 3 nox_pred03 1.874106
## 4 nox_pred04 1.875422
## 5 nox_pred05 1.874337
## 6 nox_pred06 1.864625
## 7 nox_pred07 1.876487
## 8 nox_pred08 1.863335
## 9 nox_pred09 1.874418
## 10 nox_pred10 1.865125
When degree of freedom is 8, the regression spline has the minimal CV RSS, so the optimal degree of freedom is 8.